require(data.table)
require(reshape2)
library(plyr)

# Restore the objects saved by the previous pipeline steps (presumably
# join_user_activity_df and unique_user_activity_df, which are used below
# without being created in this script).
load("02_07_m5s_join_user_activity_df.RData")
load("02_08_m5s_unique_user_activity_df.RData")

# Keep only the users whose universal_id starts with "univ".
has_universal_id <- grepl("^univ", unique_user_activity_df$universal_id)
unique_user_activity_df <- unique_user_activity_df[has_universal_id, ]

# Some Facebook users changed their name, so the same (source, label_id) key
# can show up with more than one universal_id, which breaks the uniqueness of
# the key. Every such group is collapsed onto the first universal_id listed
# for it.
key_cols <- subset(unique_user_activity_df, select = c("source", "label_id"))
dup <- duplicated(key_cols)
dup1 <- duplicated(key_cols, fromLast = TRUE)
# dup | dup1 flags every member of a duplicated group, not only the repeats.
df <- unique_user_activity_df[dup | dup1, ]
# View(df[order(df$label_id), ])

# Group on the full (source, label_id) key: grouping on label_id alone would
# also merge accounts from different sources that happen to share a label_id.
# match(x, x) gives the position of the first occurrence of each key, so
# replace_with is the first universal_id of the row's group.
dup_key <- paste(df$source, df$label_id, sep = "\037")
dict <- data.frame(value = df$universal_id,
                   replace_with = df$universal_id[match(dup_key, dup_key)],
                   stringsAsFactors = FALSE)

# Rewrite every occurrence of a superseded universal_id (not only the
# duplicated rows). Skipped when there is nothing to replace, which also
# covers the case of no duplicated keys at all.
# NOTE(review): chains (A -> B in one group, B -> C in another) are resolved
# in a single pass only.
bool <- unique_user_activity_df$universal_id %in% dict$value
if (any(bool)) {
  unique_user_activity_df$universal_id[bool] <-
    dict$replace_with[match(unique_user_activity_df$universal_id[bool],
                            dict$value)]
}

# Rows that became identical after the replacement are dropped.
unique_user_activity_df <- unique(unique_user_activity_df)

# Keep the original source value in source_micro, then fold the three
# *_facebook sources into a single "facebook" source.
join_user_activity_df$source_micro <- join_user_activity_df$source
facebook_sources <- c("bg_facebook", "m5s_facebook", "pf_facebook")
join_user_activity_df$source <- revalue(
  join_user_activity_df$source,
  setNames(rep("facebook", length(facebook_sources)), facebook_sources)
)


# Key both tables on (source, label_id) and inner-join them on that key:
# activity rows without a matching user row are dropped, and the user columns
# are attached to the remaining ones.
join_key <- c("source", "label_id")
setkeyv(join_user_activity_df, join_key)
setkeyv(unique_user_activity_df, join_key)

join_user_activity_df_univ_only <- merge(x = join_user_activity_df,
                                         y = unique_user_activity_df,
                                         by = join_key)

# The unmerged activity table is not used below.
rm(join_user_activity_df)

                                        # Gender analysis
# Load the dictionary and prepare it
# Read the name dictionary and drop exact duplicate rows. Any V1 value that
# still occurs more than once afterwards is removed entirely (presumably a
# name listed with more than one gender -- TODO confirm the column layout).
wiktionary_name_gender <- unique(
  read.csv("italian_names_by_gender.csv", header = FALSE)
)
ambiguous_names <-
  wiktionary_name_gender$V1[duplicated(wiktionary_name_gender$V1)]
wiktionary_name_gender <-
  wiktionary_name_gender[!(wiktionary_name_gender$V1 %in% ambiguous_names), ]

# Attribute a gender to every distinct user name, then attach it to the
# activity table with a left join on name.
# NOTE(review): genderAttribution() is not defined or sourced in this script;
# it has to be available in the workspace already -- confirm where it lives.
distinct_names <- unique(join_user_activity_df_univ_only$name)
tmp <- data.table(
  data.frame(name = distinct_names,
             gender = genderAttribution(distinct_names,
                                        wiktionary_name_gender))
)

setkeyv(tmp, "name")
setkeyv(join_user_activity_df_univ_only, "name")

# all.x = TRUE keeps every activity row, whether or not tmp has its name.
join_user_activity_df_univ_only <- merge(x = join_user_activity_df_univ_only,
                                         y = tmp,
                                         all.x = TRUE)

# Re-key on universal_id before saving.
setkeyv(join_user_activity_df_univ_only, "universal_id")

save(join_user_activity_df_univ_only,
     file = "02_09_m5s_join_user_activity_df_univ_only.RData")

# Summary statistics: overall totals from the merged activity table, and the
# number of distinct users per platform from the user table.
sum_stats <- list(
  tot_activities = nrow(join_user_activity_df_univ_only),
  tot_users = length(unique(join_user_activity_df_univ_only$universal_id)),
  activity_table = table(join_user_activity_df_univ_only$activity)
)

# Statistic name -> pattern matched against `source`.
platform_patterns <- c(unique_users_blog = "blog",
                       unique_users_forum = "forum",
                       unique_users_fb = "facebook",
                       unique_users_meetup = "meetup")
for (stat_name in names(platform_patterns)) {
  on_platform <- grepl(platform_patterns[[stat_name]],
                       unique_user_activity_df$source)
  sum_stats[[stat_name]] <-
    length(unique(unique_user_activity_df$universal_id[on_platform]))
}

# Order in which every user became active on the different sources: one row
# per universal_id, with columns action1, action2, ... holding the sources
# ranked by the date of the user's earliest activity on each of them.
activity <- setDT(join_user_activity_df_univ_only)

# The table is keyed by universal_id, not by date, so the first row of a
# (universal_id, source) group in table order is not necessarily the earliest
# one. Walk the rows in date order and keep the first row of each group.
date_order <- order(activity$date)
keys_by_date <- activity[date_order, c("universal_id", "source"), with = FALSE]
first_pos <- keys_by_date[, .I[1L], by = c("universal_id", "source")]$V1

# Row numbers (in the activity table) of each user's first activity per
# source, listed in date order.
ind <- date_order[sort(first_pos)]

# first_use is a copy sorted by date, so numbering the rows within each user
# yields the chronological rank of every source.
first_use <- activity[ind]
first_use[, action := paste0('action', seq_len(.N)), by = universal_id]

order_of_activities <- dcast(first_use, universal_id ~ action,
                             value.var = 'source')

save(sum_stats, file = "02_09_m5s_join_univ_sum_stats.RData")
save(order_of_activities, file = "02_09_m5s_join_behav_order_activities.RData")



